# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, roc_auc_score

# Load dataset into a dataframe and display the first few rows of the dataframe
df = pd.read_csv("HR_dataset.csv")
df.head(5)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB

df.describe(include='all')

# Display all column names
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

# Rename columns as needed
rename_dict = {'Department':'department', 'Work_accident':'work_accident', 'promotion_last_5years':'promotion_last_5_years', 'average_montly_hours' : 'average_monthly_hours'}
df.rename(mapper=rename_dict, axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   satisfaction_level      14999 non-null  float64
 1   last_evaluation         14999 non-null  float64
 2   number_project          14999 non-null  int64  
 3   average_monthly_hours   14999 non-null  int64  
 4   time_spend_company      14999 non-null  int64  
 5   work_accident           14999 non-null  int64  
 6   left                    14999 non-null  int64  
 7   promotion_last_5_years  14999 non-null  int64  
 8   department              14999 non-null  object 
 9   salary                  14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB

# Change work_accident, left and promotion_last_5_years to categorical for purposes of data exploration
df['work_accident'] = df['work_accident'].astype('category')
df['left'] = df['left'].astype('category')
df['promotion_last_5_years'] = df['promotion_last_5_years'].astype('category')

df.isna().sum()

satisfaction_level        0
last_evaluation           0
number_project            0
average_monthly_hours     0
time_spend_company        0
work_accident             0
left                      0
promotion_last_5_years    0
department                0
salary                    0
dtype: int64

df_dups = df[df.duplicated(keep=False) == True]
df_dups.sort_values(by = ['satisfaction_level','last_evaluation','time_spend_company', 'department']).head(6)

# Drop duplicates and save resulting dataframe in a new variable as needed
df.drop_duplicates(inplace=True)

# Display first few rows of new dataframe
df.head()

import pandas as pd

def calculate_outlier_table(df, columns):
    """Return a DataFrame summarizing outlier percentages for specified columns.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        columns (list of str): List of column names to check for outliers.

    Returns:
        pd.DataFrame: Table with outlier percentages for each column.
    """
    results = []
    for col in columns:
        if col in df.columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            total = len(df)
            left_outliers = df[df[col] < lower_bound]
            right_outliers = df[df[col] > upper_bound]

            left_pct = (len(left_outliers) / total) * 100 if total > 0 else 0
            right_pct = (len(right_outliers) / total) * 100 if total > 0 else 0
            total_pct = left_pct + right_pct

            results.append({
                'column': col,
                'outlier_percentage': total_pct,
                'left_outlier_percentage': left_pct,
                'right_outlier_percentage': right_pct
            })
        else:
            results.append({
                'column': col,
                'outlier_percentage': None,
                'left_outlier_percentage': None,
                'right_outlier_percentage': None
            })

    return pd.DataFrame(results)

numerical_features = []

categorical_features = []

for col in df.columns:
    if (df[col].dtype == 'int64') or (df[col].dtype == 'float64'):
        numerical_features.append(col)
    else: 
        categorical_features.append(col)

print(calculate_outlier_table(df, numerical_features))

                  column  outlier_percentage  left_outlier_percentage  \
0     satisfaction_level            0.000000                      0.0   
1        last_evaluation            0.000000                      0.0   
2         number_project            0.000000                      0.0   
3  average_monthly_hours            0.000000                      0.0   
4     time_spend_company            6.871821                      0.0   

   right_outlier_percentage  
0                  0.000000  
1                  0.000000  
2                  0.000000  
3                  0.000000  
4                  6.871821

# Create a boxplot to visualize distribution of `tenure` and detect any outliers
sns.boxplot(data=df, x='time_spend_company')
plt.show()

sns.boxplot(data=df, x='time_spend_company', y='department')
plt.show()

sns.histplot(data=df, x='time_spend_company', bins=4, binrange=(2,10), hue='left')

<Axes: xlabel='time_spend_company', ylabel='Count'>

len(df[df.time_spend_company >= 4]) / len(df)

0.32449337002752066

df.groupby('left', observed=False).mean(numeric_only=True)

df.groupby('department').mean(numeric_only=True)

sns.histplot(data=df, x='average_monthly_hours', hue='left')

<Axes: xlabel='average_monthly_hours', ylabel='Count'>

def plot_feature_distributions(data, target, features):
    """
    Plots boxplot and histplot side by side for each feature in the given list.
    
    Parameters:
        data (pd.DataFrame): The DataFrame containing the features.
        features (list): List of feature names to plot.
    """
    num_features = len(features)
    
    # Set figure size dynamically based on the number of features
    fig, axes = plt.subplots(num_features, 2, figsize=(12, 4 * num_features))
    
    if num_features == 1:
        axes = [axes]  # Ensure axes is iterable when there's only one feature
    
    for i, feature in enumerate(features):
        sns.boxplot(x=data[feature], hue=data[target], ax=axes[i][0])
        axes[i][0].set_title(f'Boxplot of {feature}')
        
        sns.histplot(data[feature], kde=True, ax=axes[i][1])
        axes[i][1].set_title(f'Histogram of {feature}')
    
    plt.tight_layout()
    plt.show()


def plot_cat_features(data, target, features):
    """
    Plots boxplot and histplot side by side for each feature in the given list.
    
    Parameters:
        data (pd.DataFrame): The DataFrame containing the features.
        features (list): List of feature names to plot.
    """
    num_features = len(features)
    
    # Set figure size dynamically based on the number of features
    fig, axes = plt.subplots(num_features, 2, figsize=(12, 4 * num_features))
    
    if num_features == 1:
        axes = [axes]  # Ensure axes is iterable when there's only one feature
    
    for i, feature in enumerate(features):
        sns.barplot(x=data[feature], y=data[target], ax=axes[i][0])
        axes[i][0].set_title(f'Barplot of {feature}')
        
        sns.histplot(data[feature], kde=False, ax=axes[i][1])
        axes[i][1].set_title(f'Histogram of {feature}')
    
    plt.tight_layout()
    plt.show()

plot_feature_distributions(df, 'left', numerical_features)

plot_cat_features(df, 'left', categorical_features)

# Plot a correlation heatmap
plt.figure(figsize=(16, 9))
heatmap = sns.heatmap(df.corr(numeric_only=True), vmin=-1, vmax=1, annot=True, cmap=sns.color_palette("vlag", as_cmap=True))
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':14}, pad=12);

df1 = pd.get_dummies(df, columns=['salary' ,'department'], drop_first=True)

X = df1.drop(['left'], axis=1)
y = df1['left']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, stratify=y, random_state=11)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

nummerical = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 'time_spend_company']

X_train_scaled= scaler.fit_transform(X_train[nummerical])
X_train_scaled = pd.DataFrame(X_train_scaled, columns=nummerical, index=X_train.index)
X_train_scaled = pd.concat([X_train_scaled, X_train[X_train.columns[~X_train.columns.isin(nummerical)]]], axis=1)
X_train_scaled.head(5)

X_test_scaled= scaler.fit_transform(X_test[nummerical])
X_test_scaled = pd.DataFrame(X_test_scaled, columns=nummerical, index=X_test.index)
X_test_scaled = pd.concat([X_test_scaled, X_test[X_test.columns[~X_test.columns.isin(nummerical)]]], axis=1)
X_test_scaled.head(5)

lg = LogisticRegression(random_state=42)
tree = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=12)
gb = GradientBoostingClassifier(random_state=12)

models = [lg, tree, rf, gb]

num_models = len(models)
cols = 2  # Number of columns in the grid
rows = math.ceil(num_models / cols)

fig, axes = plt.subplots(rows, cols, figsize=(15, 5*rows))
axes = axes.flatten()

for i, model in enumerate(models):
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot(ax=axes[i])
    axes[i].set_title(f"Confusion Matrix for {model.__class__.__name__}")

# Remove unused subplots
for j in range(i+1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

def evaluate_models(models, X_test, y_test):
    """
    Evaluates multiple trained models on test data and prints a comparison table.
    
    Parameters:
        models (dict): A dictionary where keys are model names and values are trained sklearn models.
        X_test (pd.DataFrame or np.array): Test feature set.
        y_test (pd.Series or np.array): True labels for the test set.
    """
    results = []
    
    for name, model in models.items():
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')
        roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else "N/A"
        
        results.append([name, accuracy, precision, recall, f1, roc_auc])
    
    # Create DataFrame for better visualization
    results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "ROC-AUC"])
    print(results_df)

# Usage:
trained_models = {
                  "Decision Tree": tree,
                  "Random Forest": rf, 
                  "Logistic Regression": lg, 
                  "GradientBoost": gb
                 }

evaluate_models(trained_models, X_test_scaled, y_test)

                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0        Decision Tree  0.960640   0.892562  0.867470  0.879837  0.923335
1        Random Forest  0.976651   0.981982  0.875502  0.925690  0.984909
2  Logistic Regression  0.828552   0.466942  0.226908  0.305405  0.810240
3        GradientBoost  0.973983   0.956522  0.883534  0.918580  0.987375

### Building a Random Forest Model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

params = {
        'n_estimators': [100, 150, 200, 250, 300],
        'max_depth': [3, 5, 7, 13],
}

rf = RandomForestClassifier(random_state=42)

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=params, 
    scoring=['roc_auc', 'accuracy', 'f1'],
    refit='f1',
    cv=5,
    verbose=0
)

rf_grid.fit(X_train_scaled, y_train)

print("")
print(f'The best random forest model features: {rf_grid.best_params_}, with a score of {rf_grid.best_score_}')
print("")

# Get predictions and confusion matrix
y_pred = rf_grid.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred, labels=rf_grid.classes_)

# Get top 10 feature importances
feat_impt = rf_grid.best_estimator_.feature_importances_
ind = np.argpartition(feat_impt, -10)[-10:]
feat = X.columns[ind]
feat_impt_top = feat_impt[ind]
feat_df_sorted = pd.DataFrame({"Feature": feat, "Importance": feat_impt_top}).sort_values("Importance")

# Create side-by-side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_grid.classes_)
disp.plot(ax=ax1)
ax1.set_title("Confusion Matrix")

# Feature importance
feat_df_sorted.plot(kind='barh', x="Feature", y="Importance", ax=ax2, legend=False)
ax2.set_title("Gradient Boosting: Feature Importances for Employee Leaving")
ax2.set_xlabel("Importance")
ax2.set_ylabel("Feature")

plt.tight_layout()
plt.show()

The best random forest model features: {'max_depth': 13, 'n_estimators': 100}, with a score of 0.9447511631209704

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

params = {
        'n_estimators': [100, 150, 200],
        'max_depth': [3, 5, 7, 13],
        'learning_rate': [1, .1, .01],
        'min_samples_leaf': [.9, 1]
}

gb = GradientBoostingClassifier(random_state=42)

gb_grid = GridSearchCV(
    estimator=gb,
    param_grid=params, 
    scoring=['roc_auc', 'accuracy', 'f1'],
    refit='f1',
    cv=5,
    verbose=0
)

gb_grid.fit(X_train_scaled, y_train)

print("")
print(f'The best gradietnt boosting model features: {gb_grid.best_params_}, with a score of {gb_grid.best_score_}')
print("")

# Get predictions and confusion matrix
y_pred = gb_grid.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred, labels=gb_grid.classes_)

# Get top 10 feature importances
feat_impt = gb_grid.best_estimator_.feature_importances_
ind = np.argpartition(feat_impt, -10)[-10:]
feat = X.columns[ind]
feat_impt_top = feat_impt[ind]
feat_df_sorted = pd.DataFrame({"Feature": feat, "Importance": feat_impt_top}).sort_values("Importance")

# Create side-by-side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=gb_grid.classes_)
disp.plot(ax=ax1)
ax1.set_title("Confusion Matrix")

# Feature importance
feat_df_sorted.plot(kind='barh', x="Feature", y="Importance", ax=ax2, legend=False)
ax2.set_title("Gradient Boosting: Feature Importances for Employee Leaving")
ax2.set_xlabel("Importance")
ax2.set_ylabel("Feature")

plt.tight_layout()
plt.show()

The best gradietnt boosting model features: {'learning_rate': 0.01, 'max_depth': 7, 'min_samples_leaf': 1, 'n_estimators': 100}, with a score of 0.9461263304457613

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	Department	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years	Department	salary
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999	14999
unique	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	10	3
top	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	sales	low
freq	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4140	7316
mean	0.612834	0.716102	3.803054	201.050337	3.498233	0.144610	0.238083	0.021268	NaN	NaN
std	0.248631	0.171169	1.232592	49.943099	1.460136	0.351719	0.425924	0.144281	NaN	NaN
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000	NaN	NaN
25%	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000	NaN	NaN
50%	0.640000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000	NaN	NaN
75%	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000	NaN	NaN
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000	NaN	NaN

	satisfaction_level	last_evaluation	number_project	average_monthly_hours	time_spend_company	left	department	salary
30	0.09	0.62	6	294	4	1	accounting	low
12030	0.09	0.62	6	294	4	1	accounting	low
14241	0.09	0.62	6	294	4	1	accounting	low
71	0.09	0.77	5	275	4	1	product_mng	medium
12071	0.09	0.77	5	275	4	1	product_mng	medium
14282	0.09	0.77	5	275	4	1	product_mng	medium

	satisfaction_level	last_evaluation	number_project	average_monthly_hours	time_spend_company	left	department	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	satisfaction_level	last_evaluation	number_project	average_monthly_hours	time_spend_company
left
0	0.667365	0.715667	3.786800	198.94270	3.262000
1	0.440271	0.721783	3.883476	208.16223	3.881467

I. Background¶

II. Data Preparation - Inspect and Clean¶

Check for any missing values in the data.¶

Check for any duplicate entries in the data.¶

Examining outliers¶

III. Explore Data and Relationships¶

Data visualizations¶

IV. Modeling¶

Explore Baseline Models¶

Fine Tuning Parameters¶

Building a Gradient Boosting Model¶

V. Conclusion, Recommendations, Next Steps¶

	satisfaction_level	last_evaluation	number_project	average_monthly_hours	time_spend_company
department
IT	0.634016	0.715051	3.797131	200.638320	3.350410
RandD	0.627176	0.712983	3.850144	201.291066	3.319885
accounting	0.607939	0.721900	3.834138	200.877617	3.404187
hr	0.621947	0.715691	3.675541	199.371048	3.256240
management	0.631995	0.726307	3.837156	201.529817	3.981651
marketing	0.634770	0.718440	3.720654	199.487370	3.421991
product_mng	0.629825	0.713790	3.794461	198.893586	3.341108
sales	0.631349	0.710398	3.777092	200.242050	3.380673
support	0.634822	0.722998	3.820977	200.627128	3.292696
technical	0.627937	0.719791	3.859180	201.115419	3.309269

	satisfaction_level	last_evaluation	number_project	average_monthly_hours	time_spend_company	work_accident	salary_low	salary_medium	department_RandD	department_accounting	department_hr	department_management	department_marketing	department_product_mng	department_sales	department_support	department_technical
3670	-0.586807	1.204042	1.031037	-0.500513	-0.272195	0	True	False	False	False	False	False	True	False	False	False	False
892	0.449539	1.619914	0.170436	1.551623	1.210005	0	True	False	False	False	False	False	False	False	False	True	False
5262	0.200816	1.501093	1.031037	0.135649	0.468905	1	True	False	True	False	False	False	False	False	False	False	False
8218	-1.830423	0.788170	1.891639	0.197213	0.468905	0	False	True	False	False	False	False	False	False	False	True	False
6934	0.698262	-0.043575	0.170436	-0.008000	-1.013295	0	True	False	False	False	False	False	False	False	False	False	False

	satisfaction_level	last_evaluation	number_project	average_monthly_hours	time_spend_company	work_accident	salary_low	salary_medium	department_RandD	department_accounting	department_hr	department_management	department_marketing	department_product_mng	department_sales	department_support	department_technical
7750	-0.016480	-1.275368	0.166662	-0.055929	-0.281343	1	True	False	False	False	False	False	False	False	False	False	True
10014	1.522283	-0.740813	1.023698	1.278413	-0.281343	0	False	False	False	False	False	False	False	False	True	False	False
8462	-0.266009	1.694382	-0.690374	1.463168	-1.068158	0	False	True	False	False	False	False	False	False	False	False	False
4562	-0.224421	0.268902	-0.690374	-0.117514	-0.281343	0	False	True	False	False	False	False	False	False	False	False	True
5090	0.732108	-0.800208	-0.690374	-0.076457	-0.281343	0	False	True	False	False	False	False	False	False	False	True	False